## Company_Data_Cleaning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df=pd.read_csv('/Users/cactusjack/Downloads/general_data.csv')
df.head()
| Age | Attrition | BusinessTravel | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeID | Gender | ... | NumCompaniesWorked | Over18 | PercentSalaryHike | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | YearsAtCompany | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 51 | No | Travel_Rarely | Sales | 6 | 2 | Life Sciences | 1 | 1 | Female | ... | 1.0 | Y | 11 | 8 | 0 | 1.0 | 6 | 1 | 0 | 0 |
| 1 | 31 | Yes | Travel_Frequently | Research & Development | 10 | 1 | Life Sciences | 1 | 2 | Female | ... | 0.0 | Y | 23 | 8 | 1 | 6.0 | 3 | 5 | 1 | 4 |
| 2 | 32 | No | Travel_Frequently | Research & Development | 17 | 4 | Other | 1 | 3 | Male | ... | 1.0 | Y | 15 | 8 | 3 | 5.0 | 2 | 5 | 0 | 3 |
| 3 | 38 | No | Non-Travel | Research & Development | 2 | 5 | Life Sciences | 1 | 4 | Male | ... | 3.0 | Y | 11 | 8 | 3 | 13.0 | 5 | 8 | 7 | 5 |
| 4 | 32 | No | Travel_Rarely | Research & Development | 10 | 1 | Medical | 1 | 5 | Male | ... | 4.0 | Y | 12 | 8 | 2 | 9.0 | 2 | 6 | 0 | 4 |
5 rows × 24 columns
## Display the first few rows of the DataFrame...
print(df.head())
Age Attrition BusinessTravel Department DistanceFromHome \ 0 51 No Travel_Rarely Sales 6 1 31 Yes Travel_Frequently Research & Development 10 2 32 No Travel_Frequently Research & Development 17 3 38 No Non-Travel Research & Development 2 4 32 No Travel_Rarely Research & Development 10 Education EducationField EmployeeCount EmployeeID Gender ... \ 0 2 Life Sciences 1 1 Female ... 1 1 Life Sciences 1 2 Female ... 2 4 Other 1 3 Male ... 3 5 Life Sciences 1 4 Male ... 4 1 Medical 1 5 Male ... NumCompaniesWorked Over18 PercentSalaryHike StandardHours \ 0 1.0 Y 11 8 1 0.0 Y 23 8 2 1.0 Y 15 8 3 3.0 Y 11 8 4 4.0 Y 12 8 StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany \ 0 0 1.0 6 1 1 1 6.0 3 5 2 3 5.0 2 5 3 3 13.0 5 8 4 2 9.0 2 6 YearsSinceLastPromotion YearsWithCurrManager 0 0 0 1 1 4 2 0 3 3 7 5 4 0 4 [5 rows x 24 columns]
## Display summary information about the DataFrame...
print(df.info)
<bound method DataFrame.info of Age Attrition BusinessTravel Department \
0 51 No Travel_Rarely Sales
1 31 Yes Travel_Frequently Research & Development
2 32 No Travel_Frequently Research & Development
3 38 No Non-Travel Research & Development
4 32 No Travel_Rarely Research & Development
... ... ... ... ...
4405 42 No Travel_Rarely Research & Development
4406 29 No Travel_Rarely Research & Development
4407 25 No Travel_Rarely Research & Development
4408 42 No Travel_Rarely Sales
4409 40 No Travel_Rarely Research & Development
DistanceFromHome Education EducationField EmployeeCount EmployeeID \
0 6 2 Life Sciences 1 1
1 10 1 Life Sciences 1 2
2 17 4 Other 1 3
3 2 5 Life Sciences 1 4
4 10 1 Medical 1 5
... ... ... ... ... ...
4405 5 4 Medical 1 4406
4406 2 4 Medical 1 4407
4407 25 2 Life Sciences 1 4408
4408 18 2 Medical 1 4409
4409 28 3 Medical 1 4410
Gender ... NumCompaniesWorked Over18 PercentSalaryHike StandardHours \
0 Female ... 1.0 Y 11 8
1 Female ... 0.0 Y 23 8
2 Male ... 1.0 Y 15 8
3 Male ... 3.0 Y 11 8
4 Male ... 4.0 Y 12 8
... ... ... ... ... ... ...
4405 Female ... 3.0 Y 17 8
4406 Male ... 2.0 Y 15 8
4407 Male ... 0.0 Y 20 8
4408 Male ... 0.0 Y 14 8
4409 Male ... 0.0 Y 12 8
StockOptionLevel TotalWorkingYears TrainingTimesLastYear \
0 0 1.0 6
1 1 6.0 3
2 3 5.0 2
3 3 13.0 5
4 2 9.0 2
... ... ... ...
4405 1 10.0 5
4406 0 10.0 2
4407 0 5.0 4
4408 1 10.0 2
4409 0 NaN 6
YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager
0 1 0 0
1 5 1 4
2 5 0 3
3 8 7 5
4 6 0 4
... ... ... ...
4405 3 0 2
4406 3 0 2
4407 4 1 2
4408 9 7 8
4409 21 3 9
[4410 rows x 24 columns]>
## Handle missing values..
df.dropna(inplace=True)
## Remove duplicates...
df.drop_duplicates(inplace=True)
## Standardize formats (e.g converting columns to lowercase)...
df.columns=df.columns.str.lower()
df.head()
| age | attrition | businesstravel | department | distancefromhome | education | educationfield | employeecount | employeeid | gender | ... | numcompaniesworked | over18 | percentsalaryhike | standardhours | stockoptionlevel | totalworkingyears | trainingtimeslastyear | yearsatcompany | yearssincelastpromotion | yearswithcurrmanager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 51 | No | Travel_Rarely | Sales | 6 | 2 | Life Sciences | 1 | 1 | Female | ... | 1.0 | Y | 11 | 8 | 0 | 1.0 | 6 | 1 | 0 | 0 |
| 1 | 31 | Yes | Travel_Frequently | Research & Development | 10 | 1 | Life Sciences | 1 | 2 | Female | ... | 0.0 | Y | 23 | 8 | 1 | 6.0 | 3 | 5 | 1 | 4 |
| 2 | 32 | No | Travel_Frequently | Research & Development | 17 | 4 | Other | 1 | 3 | Male | ... | 1.0 | Y | 15 | 8 | 3 | 5.0 | 2 | 5 | 0 | 3 |
| 3 | 38 | No | Non-Travel | Research & Development | 2 | 5 | Life Sciences | 1 | 4 | Male | ... | 3.0 | Y | 11 | 8 | 3 | 13.0 | 5 | 8 | 7 | 5 |
| 4 | 32 | No | Travel_Rarely | Research & Development | 10 | 1 | Medical | 1 | 5 | Male | ... | 4.0 | Y | 12 | 8 | 2 | 9.0 | 2 | 6 | 0 | 4 |
5 rows × 24 columns
df.dtypes
age int64 attrition object businesstravel object department object distancefromhome int64 education int64 educationfield object employeecount int64 employeeid int64 gender object joblevel int64 jobrole object maritalstatus object monthlyincome int64 numcompaniesworked float64 over18 object percentsalaryhike int64 standardhours int64 stockoptionlevel int64 totalworkingyears float64 trainingtimeslastyear int64 yearsatcompany int64 yearssincelastpromotion int64 yearswithcurrmanager int64 dtype: object
df.shape
(4382, 24)
## Check for missing values...
print(df.isnull().sum())
age 0 attrition 0 businesstravel 0 department 0 distancefromhome 0 education 0 educationfield 0 employeecount 0 employeeid 0 gender 0 joblevel 0 jobrole 0 maritalstatus 0 monthlyincome 0 numcompaniesworked 0 over18 0 percentsalaryhike 0 standardhours 0 stockoptionlevel 0 totalworkingyears 0 trainingtimeslastyear 0 yearsatcompany 0 yearssincelastpromotion 0 yearswithcurrmanager 0 dtype: int64
## 2. Data Exploration
## Compute summary statistics...
print(df.describe())
age distancefromhome education employeecount employeeid \
count 4382.000000 4382.000000 4382.000000 4382.0 4382.000000
mean 36.933364 9.198996 2.912369 1.0 2207.804884
std 9.137272 8.105396 1.024728 0.0 1271.688783
min 18.000000 1.000000 1.000000 1.0 1.000000
25% 30.000000 2.000000 2.000000 1.0 1108.250000
50% 36.000000 7.000000 3.000000 1.0 2208.500000
75% 43.000000 14.000000 4.000000 1.0 3308.750000
max 60.000000 29.000000 5.000000 1.0 4409.000000
joblevel monthlyincome numcompaniesworked percentsalaryhike \
count 4382.000000 4382.000000 4382.000000 4382.000000
mean 2.063898 65061.702419 2.693291 15.210634
std 1.106115 47142.310175 2.497832 3.663007
min 1.000000 10090.000000 0.000000 11.000000
25% 1.000000 29110.000000 1.000000 12.000000
50% 2.000000 49190.000000 2.000000 14.000000
75% 3.000000 83790.000000 4.000000 18.000000
max 5.000000 199990.000000 9.000000 25.000000
standardhours stockoptionlevel totalworkingyears \
count 4382.0 4382.000000 4382.000000
mean 8.0 0.794614 11.290278
std 0.0 0.852397 7.785717
min 8.0 0.000000 0.000000
25% 8.0 0.000000 6.000000
50% 8.0 1.000000 10.000000
75% 8.0 1.000000 15.000000
max 8.0 3.000000 40.000000
trainingtimeslastyear yearsatcompany yearssincelastpromotion \
count 4382.000000 4382.000000 4382.000000
mean 2.798266 7.010497 2.191693
std 1.289402 6.129351 3.224994
min 0.000000 0.000000 0.000000
25% 2.000000 3.000000 0.000000
50% 3.000000 5.000000 1.000000
75% 3.000000 9.000000 3.000000
max 6.000000 40.000000 15.000000
yearswithcurrmanager
count 4382.000000
mean 4.126198
std 3.569674
min 0.000000
25% 2.000000
50% 3.000000
75% 7.000000
max 17.000000
## Explore relationship between variables..
## For example, lets explore the relationship between Age and Monthly Income...
sns.scatterplot(x='age', y='monthlyincome', data=df)
plt.title('age vs. monthlyincome')
plt.show()
## 3. Visualization...
## Visualize the distribution of Attrition...
sns.countplot(x='attrition', data=df)
plt.title('Distribution of Attrition')
plt.show()
## Visualize the distribution of Business Travel...
sns.countplot(x='businesstravel', data=df)
plt.title('Distribution of Business')
plt.show()
## Visualize the distribution of Department...
sns.countplot(x='department', data=df)
plt.title('Distribution of Department')
plt.show()
## Visualize the correlation matrix...
correlation_matrix=df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()
import plotly.express as px
## Distribution of Age...
fig=px.histogram(df, x='age', nbins=20, title='Distribution of Age')
fig.show()
## Distribution of Monthly Income...
fig=px.histogram(df, x='monthlyincome', title='Distribution of Monthly Income')
fig.show()
## Relationship between Age and Monthly Income colored by Attrition...
fig=px.scatter(df, x='age', y='monthlyincome', color='attrition', title='Age vs. Monthly Income (Attrition)')
fig.show()
## Relationship between Job Level and Total Working Years colored by Attrition and Business Travel...
fig=px.scatter(df, x='joblevel', y='totalworkingyears', color='attrition', symbol='businesstravel', title='Job Level vs. Total Working Years (Attrition & Business Travel)')
fig.show()
## Box plot of Monthly Income grouped by Department and Attrition...
fig=px.box(df, x='department', y='monthlyincome', color='attrition', title='Monthly Income by Department and Attrition')
fig.show()
## Interactive bar chart of Attrition counts by Marital Status and Gender...
fig=px.bar(df, x='maritalstatus', color='gender', facet_col='attrition', title='Attrition Counts by Marital Status and Gender')
fig.show()
## Descriptive Analysis
## Numeric Variables
numeric_stats=df.describe()
print(numeric_stats)
age distancefromhome education employeecount employeeid \
count 4382.000000 4382.000000 4382.000000 4382.0 4382.000000
mean 36.933364 9.198996 2.912369 1.0 2207.804884
std 9.137272 8.105396 1.024728 0.0 1271.688783
min 18.000000 1.000000 1.000000 1.0 1.000000
25% 30.000000 2.000000 2.000000 1.0 1108.250000
50% 36.000000 7.000000 3.000000 1.0 2208.500000
75% 43.000000 14.000000 4.000000 1.0 3308.750000
max 60.000000 29.000000 5.000000 1.0 4409.000000
joblevel monthlyincome numcompaniesworked percentsalaryhike \
count 4382.000000 4382.000000 4382.000000 4382.000000
mean 2.063898 65061.702419 2.693291 15.210634
std 1.106115 47142.310175 2.497832 3.663007
min 1.000000 10090.000000 0.000000 11.000000
25% 1.000000 29110.000000 1.000000 12.000000
50% 2.000000 49190.000000 2.000000 14.000000
75% 3.000000 83790.000000 4.000000 18.000000
max 5.000000 199990.000000 9.000000 25.000000
standardhours stockoptionlevel totalworkingyears \
count 4382.0 4382.000000 4382.000000
mean 8.0 0.794614 11.290278
std 0.0 0.852397 7.785717
min 8.0 0.000000 0.000000
25% 8.0 0.000000 6.000000
50% 8.0 1.000000 10.000000
75% 8.0 1.000000 15.000000
max 8.0 3.000000 40.000000
trainingtimeslastyear yearsatcompany yearssincelastpromotion \
count 4382.000000 4382.000000 4382.000000
mean 2.798266 7.010497 2.191693
std 1.289402 6.129351 3.224994
min 0.000000 0.000000 0.000000
25% 2.000000 3.000000 0.000000
50% 3.000000 5.000000 1.000000
75% 3.000000 9.000000 3.000000
max 6.000000 40.000000 15.000000
yearswithcurrmanager
count 4382.000000
mean 4.126198
std 3.569674
min 0.000000
25% 2.000000
50% 3.000000
75% 7.000000
max 17.000000
## Categorical Variables...
categorical_cols=df.select_dtypes(include=['object']).columns
for col in categorical_cols:
print(df[col].value_counts())
No 3677 Yes 705 Name: attrition, dtype: int64 Travel_Rarely 3109 Travel_Frequently 825 Non-Travel 448 Name: businesstravel, dtype: int64 Research & Development 2865 Sales 1330 Human Resources 187 Name: department, dtype: int64 Life Sciences 1806 Medical 1385 Marketing 475 Technical Degree 392 Other 244 Human Resources 80 Name: educationfield, dtype: int64 Male 2626 Female 1756 Name: gender, dtype: int64 Sales Executive 975 Research Scientist 872 Laboratory Technician 773 Manufacturing Director 429 Healthcare Representative 389 Manager 305 Sales Representative 246 Research Director 237 Human Resources 156 Name: jobrole, dtype: int64 Married 2007 Single 1405 Divorced 970 Name: maritalstatus, dtype: int64 Y 4382 Name: over18, dtype: int64
## Correlation Matrix
correlation_matrix=df.corr()
## Pair Plot
sns.pairplot(df)
plt.title('Pair Plot')
plt.show()
## Advanced Visualizations
## Scatter Plot Matrix using Plotly
fig=px.scatter_matrix(df, dimensions=['age','monthlyincome','totalworkingyears','yearsatcompany'], color='attrition', title='Scatter Plot Matrix')
fig.show()
## 3D Scatter Plot using Plotly
fig=px.scatter_3d(df, x='age', y='totalworkingyears', z='monthlyincome', color='attrition', title='3D Scatter Plot')
fig.show()
## Box Plot using Seaborn...
plt.figure(figsize=(12, 6))
sns.boxplot(x='department',y='monthlyincome', hue='attrition', data=df)
plt.title('Box Plot of Monthly Income by Department and Attrition')
plt.show()
## Violin Plot using Seaborn
plt.figure(figsize=(12, 6))
sns.violinplot(x='jobrole', y='yearsatcompany', hue='attrition', data=df, split=True)
plt.title('Violin Plot of Years At Company by Job Role and Attrition')
plt.xticks(rotation=45)
plt.show()
## Line Plot using Matplotlib...
plt.figure(figsize=(12, 6))
sns.lineplot(x='yearsatcompany', y='percentsalaryhike', hue='attrition', data=df)
plt.title('Line plot of Percent Salary Hikeover Years At Company by Attrition')
plt.show()
## Advanced descriptive analysis...
## Additional Statistical Measures
## Median
median_values=df.median()
median_values
age 36.0 distancefromhome 7.0 education 3.0 employeecount 1.0 employeeid 2208.5 joblevel 2.0 monthlyincome 49190.0 numcompaniesworked 2.0 percentsalaryhike 14.0 standardhours 8.0 stockoptionlevel 1.0 totalworkingyears 10.0 trainingtimeslastyear 3.0 yearsatcompany 5.0 yearssincelastpromotion 1.0 yearswithcurrmanager 3.0 dtype: float64
## Mode
mode_values=df.mode().iloc[0]
mode_values
age 35 attrition No businesstravel Travel_Rarely department Research & Development distancefromhome 2 education 3 educationfield Life Sciences employeecount 1 employeeid 1 gender Male joblevel 1 jobrole Sales Executive maritalstatus Married monthlyincome 23420 numcompaniesworked 1 over18 Y percentsalaryhike 11 standardhours 8 stockoptionlevel 0 totalworkingyears 10 trainingtimeslastyear 2 yearsatcompany 5 yearssincelastpromotion 0 yearswithcurrmanager 2 Name: 0, dtype: object
## Skewness
skewness=df.skew()
skewness
age 0.413048 distancefromhome 0.955517 education -0.288977 employeecount 0.000000 employeeid -0.002335 joblevel 1.021797 monthlyincome 1.367457 numcompaniesworked 1.029174 percentsalaryhike 0.819510 standardhours 0.000000 stockoptionlevel 0.967263 totalworkingyears 1.115419 trainingtimeslastyear 0.551818 yearsatcompany 1.764619 yearssincelastpromotion 1.980992 yearswithcurrmanager 0.834277 dtype: float64
## Kurtosis
kurtosis=df.kurtosis()
kurtosis
age -0.409517 distancefromhome -0.230691 education -0.565008 employeecount 0.000000 employeeid -1.198607 joblevel 0.388189 monthlyincome 0.990836 numcompaniesworked 0.014307 percentsalaryhike -0.306951 standardhours 0.000000 stockoptionlevel 0.356755 totalworkingyears 0.909316 trainingtimeslastyear 0.494215 yearsatcompany 3.930726 yearssincelastpromotion 3.592162 yearswithcurrmanager 0.170703 dtype: float64
## Quantiles
quantiles=df.quantile([0.25, 0.5, 0.75])
quantiles
| age | distancefromhome | education | employeecount | employeeid | joblevel | monthlyincome | numcompaniesworked | percentsalaryhike | standardhours | stockoptionlevel | totalworkingyears | trainingtimeslastyear | yearsatcompany | yearssincelastpromotion | yearswithcurrmanager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.25 | 30.0 | 2.0 | 2.0 | 1.0 | 1108.25 | 1.0 | 29110.0 | 1.0 | 12.0 | 8.0 | 0.0 | 6.0 | 2.0 | 3.0 | 0.0 | 2.0 |
| 0.50 | 36.0 | 7.0 | 3.0 | 1.0 | 2208.50 | 2.0 | 49190.0 | 2.0 | 14.0 | 8.0 | 1.0 | 10.0 | 3.0 | 5.0 | 1.0 | 3.0 |
| 0.75 | 43.0 | 14.0 | 4.0 | 1.0 | 3308.75 | 3.0 | 83790.0 | 4.0 | 18.0 | 8.0 | 1.0 | 15.0 | 3.0 | 9.0 | 3.0 | 7.0 |
## Identify Outliers
## Box plot
plt.figure(figsize=(12, 6))
sns.boxplot(data=df.select_dtypes(include=['int', 'float']))
plt.title('Box Plot of Numeric Variables')
plt.xticks(rotation=45)
plt.show()
## Scatter plot for Age vs. Monthly Income to identify outliers...
plt.figure(figsize=(8, 6))
sns.scatterplot(x='age', y='monthlyincome', data=df)
plt.title('Scatter Plot of Age vs. Monthly Income')
plt.show()
## Analyze Distributions
## Histogram
plt.figure(figsize=(10, 6))
sns.histplot(df['monthlyincome'], kde=True)
plt.title('Histogram of Monthly Income')
plt.xlabel('Monthly Income')
plt.ylabel('Frequency')
plt.show()
## Kernel Density Estimation Plot...
plt.figure(figsize=(10, 6))
sns.kdeplot(df['totalworkingyears'], shade=True)
plt.title('Kernel Density Estimation Plot of Total Working Years')
plt.xlabel('Total Working Years')
plt.ylabel('Density')
plt.show()
## Q-Q Plot...
import scipy.stats as stats
plt.figure(figsize=(8, 6))
stats.probplot(df['yearsatcompany'], dist="norm", plot=plt)
plt.title('Q-Q Plot of Years At Company')
plt.xlabel('Theoretical Quantiles')
plt.ylabel('Sample Quantiles')
plt.show()
## Employee Turnover Analysis...
## 1. Define employee turnover
## Check the distribution of Attrition
print(df['attrition'].value_counts())
No 3677 Yes 705 Name: attrition, dtype: int64
## 2. Identify potential factors
## Explore relevant features that may influence turnover...
## For example: Age, Monthly Income, Job Role, Job Level, Work-Life Balance, Job Satisfaction, etc...
## 3. Analyze Relationships
## Explore correlations between potential factors and turnover...
## For example: Correlation matrix, Pair plot, Box plots, etc...
## Correlation matrix
correlation_matrix=df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()
## Pair plot
sns.pairplot(df, hue='attrition', diag_kind='kde')
plt.title('Pair Plot')
plt.show()
/Users/cactusjack/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:305: UserWarning: Dataset has 0 variance; skipping density estimate. /Users/cactusjack/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:305: UserWarning: Dataset has 0 variance; skipping density estimate. /Users/cactusjack/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:305: UserWarning: Dataset has 0 variance; skipping density estimate. /Users/cactusjack/opt/anaconda3/lib/python3.8/site-packages/seaborn/distributions.py:305: UserWarning: Dataset has 0 variance; skipping density estimate.
## Box plots of potential factors by Attrition...
plt.figure(figsize=(12, 8))
sns.boxplot(x='attrition', y='age', data=df)
plt.title('Box Plot of Age by Attrition')
plt.show()
plt.figure(figsize=(12, 8))
sns.boxplot(x='attrition', y='monthlyincome', data=df)
plt.title('Box Plot of Monthly Income by Attrition')
plt.show()
## 4. Develop retention strategies...
## Based on the analysis, identify factors strongly associated with turnover...
## Develop strategies to address these factors and improve employee retention...
## Example Strategies:
## Offer competitive salaries and benefits...
## Provide opportunities for career development and advancement...
## Improve work-life balance initiatives...
## Enhance Communication and feedback channels...
## Foster a positive organizational culture and employee engagement...
## Predictive Analytics..
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
## Data preprocessing
## Encode categorical variables...
label_encoder=LabelEncoder()
df_encoded=df.copy()
categorical_cols=df_encoded.select_dtypes(include=['object']).columns
for col in categorical_cols:
df_encoded[col]=label_encoder.fit_transform(df_encoded[col])
## Split the dataset into features and target variable...
x=df_encoded.drop('attrition', axis=1)
y=df_encoded['attrition']
## Split the dataset into training and testing sets...
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2, random_state=42)
## Model training
## Random Forest Classifier...
rf_classifier= RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(x_train, y_train)
RandomForestClassifier(random_state=42)
## Model evaluation
## Predictions
y_pred=rf_classifier.predict(x_test)
## Accuracy...
accuracy=accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
Accuracy: 0.9954389965792474
## Classification report..
print("Classification Report:")
print(classification_report(y_test, y_pred))
Classification Report:
precision recall f1-score support
0 0.99 1.00 1.00 755
1 1.00 0.97 0.98 122
accuracy 1.00 877
macro avg 1.00 0.98 0.99 877
weighted avg 1.00 1.00 1.00 877
## Confusion matrix
conf_matrix=confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
Confusion Matrix: [[755 0] [ 4 118]]
## Feature importance analysis...
## Define feature importance...
feature_importance=rf_classifier.feature_importances_
feature_importance = rf_classifier.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': x.columns, 'Importance':feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(feature_importance_df)
Feature Importance:
Feature Importance
12 monthlyincome 0.097794
0 age 0.096817
18 totalworkingyears 0.078219
20 yearsatcompany 0.069998
3 distancefromhome 0.066294
15 percentsalaryhike 0.063721
13 numcompaniesworked 0.055151
10 jobrole 0.051456
22 yearswithcurrmanager 0.050315
21 yearssincelastpromotion 0.044419
19 trainingtimeslastyear 0.041117
5 educationfield 0.040899
11 maritalstatus 0.037934
4 education 0.037045
7 employeeid 0.036436
9 joblevel 0.033229
17 stockoptionlevel 0.031500
1 businesstravel 0.027489
2 department 0.023937
8 gender 0.016231
6 employeecount 0.000000
14 over18 0.000000
16 standardhours 0.000000
## Plot feature importance...
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
This report presents the findings of an analysis conducted to understand factors contributing to employee turnover and develop strategies to improve retention. The analysis utilized data on employee demographics, job attributes, and turnover status.
Employee Turnover:
Predictive Model Performance:
Factors Influencing Turnover:
Improve Work-Life Balance:
Offer Career Development Opportunities:
Enhance Communication Channels:
## Distribution of employee turnover (Attrition)
plt.figure(figsize=(8, 6))
attrition_counts=df['attrition'].value_counts()
plt.pie(attrition_counts, labels=attrition_counts.index, autopct='%1.1f%%', startangle=140, colors=['lightblue', 'lightcoral'])
plt.title('Distribution of Employee Turnover (Attrition)')
plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
## Distribution of turnover by job role...
## Check the type of the 'Attrition' column
print(type(df['attrition']))
<class 'pandas.core.series.Series'>
## Check the structure of the DataFrame...
print(df.head())
age attrition businesstravel department distancefromhome \ 0 51 No Travel_Rarely Sales 6 1 31 Yes Travel_Frequently Research & Development 10 2 32 No Travel_Frequently Research & Development 17 3 38 No Non-Travel Research & Development 2 4 32 No Travel_Rarely Research & Development 10 education educationfield employeecount employeeid gender ... \ 0 2 Life Sciences 1 1 Female ... 1 1 Life Sciences 1 2 Female ... 2 4 Other 1 3 Male ... 3 5 Life Sciences 1 4 Male ... 4 1 Medical 1 5 Male ... numcompaniesworked over18 percentsalaryhike standardhours \ 0 1.0 Y 11 8 1 0.0 Y 23 8 2 1.0 Y 15 8 3 3.0 Y 11 8 4 4.0 Y 12 8 stockoptionlevel totalworkingyears trainingtimeslastyear yearsatcompany \ 0 0 1.0 6 1 1 1 6.0 3 5 2 3 5.0 2 5 3 3 13.0 5 8 4 2 9.0 2 6 yearssincelastpromotion yearswithcurrmanager 0 0 0 1 1 4 2 0 3 3 7 5 4 0 4 [5 rows x 24 columns]
if isinstance(df['attrition'], list):
df['attrition'] = pd.Series(df['attrition'])
if isinstance(df['jobrole'], list):
df['jobrole'] = pd.Series(df['jobrole'])
print(df['attrition'].unique())
['No' 'Yes']
plt.figure(figsize=(10, 8))
turnover_by_job_role = df.groupby('jobrole')
['attrition']
['attrition']
<Figure size 720x576 with 0 Axes>
import plotly.graph_objects as go
import plotly_express as px
## Distribution of employee turnover (Attrition)
attrition_counts = df['attrition'].value_counts()
fig = px.pie(values=attrition_counts, names=attrition_counts.index, title='Distribution of Employee Turnover (Attrition)')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
## Distribution of turnover by job role...
## Distribution of employee turnover (Attrition)
attrition_counts = df['attrition'].value_counts()
fig = go.Figure(data=[go.Pie(labels=attrition_counts.index, values=attrition_counts)])
fig.update_traces(hoverinfo='label+percent', textinfo='percent+label', textfont_size=15, marker=dict(colors=['#66c2a5', '#fc8d62']))
fig.update_layout(title='Distribution of Employee Turnover (Attrition)', title_x=0.5)
fig.show()
## Distribution of turnover by job role...
plt.figure(figsize=(15, 10))
turnover_by_job_role = df.groupby('jobrole')['attrition'].value_counts().unstack().fillna(0)
<Figure size 1080x720 with 0 Axes>
## Plot pie charts for each job role...
fig = go.Figure()
## Add pie chat traces for each job role...
[fig.add_trace(go.Pie(labels=turnover_by_job_role.index, values=turnover_by_job_role[column], name=column)) for column in turnover_by_job_role.columns]
[Figure({
'data': [{'labels': array(['Healthcare Representative', 'Human Resources', 'Laboratory Technician',
'Manager', 'Manufacturing Director', 'Research Director',
'Research Scientist', 'Sales Executive', 'Sales Representative'],
dtype=object),
'name': 'No',
'type': 'pie',
'values': array([333, 135, 648, 263, 381, 183, 714, 810, 210])},
{'labels': array(['Healthcare Representative', 'Human Resources', 'Laboratory Technician',
'Manager', 'Manufacturing Director', 'Research Director',
'Research Scientist', 'Sales Executive', 'Sales Representative'],
dtype=object),
'name': 'Yes',
'type': 'pie',
'values': array([ 56, 21, 125, 42, 48, 54, 158, 165, 36])}],
'layout': {'grid': {'columns': 2}, 'template': '...', 'title': {'text': 'Distribution of Turnover by Job Role'}}
}),
Figure({
'data': [{'labels': array(['Healthcare Representative', 'Human Resources', 'Laboratory Technician',
'Manager', 'Manufacturing Director', 'Research Director',
'Research Scientist', 'Sales Executive', 'Sales Representative'],
dtype=object),
'name': 'No',
'type': 'pie',
'values': array([333, 135, 648, 263, 381, 183, 714, 810, 210])},
{'labels': array(['Healthcare Representative', 'Human Resources', 'Laboratory Technician',
'Manager', 'Manufacturing Director', 'Research Director',
'Research Scientist', 'Sales Executive', 'Sales Representative'],
dtype=object),
'name': 'Yes',
'type': 'pie',
'values': array([ 56, 21, 125, 42, 48, 54, 158, 165, 36])}],
'layout': {'grid': {'columns': 2}, 'template': '...', 'title': {'text': 'Distribution of Turnover by Job Role'}}
})]
## Update pie chat layout...
fig.update_traces(textposition='inside', textinfo='percent+label', hole=.3)
fig.update_layout(title='Distribution of Turnover by Job Role', title_x=0.5, grid=dict(columns=2))
fig.show()
## Calculate average job satisfaction rating by gender and age group...
print(df.head())
age attrition businesstravel department distancefromhome \ 0 51 No Travel_Rarely Sales 6 1 31 Yes Travel_Frequently Research & Development 10 2 32 No Travel_Frequently Research & Development 17 3 38 No Non-Travel Research & Development 2 4 32 No Travel_Rarely Research & Development 10 education educationfield employeecount employeeid gender ... \ 0 2 Life Sciences 1 1 Female ... 1 1 Life Sciences 1 2 Female ... 2 4 Other 1 3 Male ... 3 5 Life Sciences 1 4 Male ... 4 1 Medical 1 5 Male ... numcompaniesworked over18 percentsalaryhike standardhours \ 0 1.0 Y 11 8 1 0.0 Y 23 8 2 1.0 Y 15 8 3 3.0 Y 11 8 4 4.0 Y 12 8 stockoptionlevel totalworkingyears trainingtimeslastyear yearsatcompany \ 0 0 1.0 6 1 1 1 6.0 3 5 2 3 5.0 2 5 3 3 13.0 5 8 4 2 9.0 2 6 yearssincelastpromotion yearswithcurrmanager 0 0 0 1 1 4 2 0 3 3 7 5 4 0 4 [5 rows x 24 columns]
## Calculate descriptive statistics for job satisfaction rating...
job_satisfaction_stats = df['jobrole'].describe()
## Calculate count of employees by gender...
gender_counts = df['gender'].value_counts()
age_groups = pd.cut(df['age'], bins=[18, 30, 40, 50, 60, 70], labels=['18-29', '30-39', '40-49', '50-59', '60+'])
age_groups_counts = age_groups.value_counts()
## Print descriptive statistics
print("Job Satisfaction Statistics")
Job Satisfaction Statistics
print(job_satisfaction_stats)
count 4382 unique 9 top Sales Executive freq 975 Name: jobrole, dtype: object
print("\nGender Counts:")
Gender Counts:
print(gender_counts)
Male 2626 Female 1756 Name: gender, dtype: int64
print("\nAge Group Counts")
Age Group Counts
print(age_groups_counts)
30-39 1844 18-29 1126 40-49 962 50-59 427 60+ 0 Name: age, dtype: int64
## Create histogram for job satisfaction rating...
fig_job_satisfaction = px.histogram(df, x='jobrole', title='Distribution of Job Satisfaction Rating')
## Create bar chart for gender counts...
fig_gender_counts = px.bar(x=gender_counts.index, y=gender_counts.values, labels={'x':'Gender', 'y': 'Count'}, title='Count of Employees by Gender')
## Create bar chart for age group counts...
fig_age_group_counts = px.bar(x=age_groups_counts.index, y=age_groups_counts.values, labels={'x':'Age Group', 'y': 'Count'}, title='Count of Employees by Age Group')
## Show Visualizations...
fig_job_satisfaction.show()
fig_gender_counts.show()
fig_age_group_counts.show()